library(dplyr)
## Warning: package 'dplyr' was built under R version 4.1.1
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.1.2
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.1.2
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.1.2
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.1.2
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(limma)
## Warning: package 'limma' was built under R version 4.1.1
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.1.2
library(plotly)
## Warning: package 'plotly' was built under R version 4.1.2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.1.2
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(data.table)
## Warning: package 'data.table' was built under R version 4.1.2
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(formattable)
## Warning: package 'formattable' was built under R version 4.1.2
##
## Attaching package: 'formattable'
## The following object is masked from 'package:plotly':
##
## style
#Data Files and prep work
source("../../../lib/DataProccess.R")
source("../../../lib/NormFuncs.R")
source("../../../lib/OutlierDetectionFuncs.R")
source("../../../lib/DataPathName.R")
BaseDir <- params$BaseDir#get the root of the directory where the data is stored
The steps to this analysis are: Remove outliers Remove trend normalize residual find constant variance
First we look at Madison data
LIMSFullDF <- ParseData(LIMSWastePath(BaseDir))%>%
filter(Site == "Madison")%>%
select(Date, Site, N1, BCoV , N2 , PMMoV,
FlowRate,temperature,equiv_sewage_amt)
ErrorMarkedDF <- LIMSFullDF%>%#
mutate(FlagedOutliers = IdentifyOutliers(N1, Action = "Flag"),
#Manual flagging that method misses due to boundary effect with binning
FlagedOutliers = ifelse(Date == mdy("01/26/2021") |
Date == mdy("01/27/2021") |
Date == mdy("01/23/2022"),
TRUE, FlagedOutliers),
NoOutlierVar = ifelse(FlagedOutliers, NA, N1))
#Split N1 into outlier and non outlier for next ggplot
OutLierPlotDF <- ErrorMarkedDF%>%
mutate(OutlierN1 = ifelse(FlagedOutliers,N1,NA))%>%
mutate(N1 := NoOutlierVar)
OutLierPlotObject <- OutLierPlotDF%>%
filter(!(is.na(N1)&is.na(OutlierN1)))%>%
ggplot(aes(x=Date))+#Data depends on time
geom_line(aes(y=N1,
color="N1",
info = N1))+#compares Var to Cases
geom_point(aes(y=OutlierN1,
color="OutlierN1",
info = OutlierN1))
## Warning: Ignoring unknown aesthetics: info
## Warning: Ignoring unknown aesthetics: info
ggplotly(OutLierPlotObject,tooltip=c("info","Date"))
UpdatedDF <- ErrorMarkedDF%>%
select(-N1)%>%
rename(N1 = NoOutlierVar)
UpdatedDF$loessN1 <- loessFit(y=(UpdatedDF$N1),
x=UpdatedDF$Date, #create loess fit of the data
span=.05, #span of .2 seems to give the best result, not rigorously chosen
iterations=2)$fitted#2 iterations remove some bad patterns
SLDLoessGraphic <- UpdatedDF%>%
NoNa("loessN1")%>%
ggplot(aes(x=Date))+
geom_line(aes(y=N1,
color="N1",
info = N1),
alpha=.2)+
geom_line(aes(y = loessN1,
color="loessN1" ,
info = loessN1))
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(ColumnNames)` instead of `ColumnNames` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
## Warning: Ignoring unknown aesthetics: info
## Warning: Ignoring unknown aesthetics: info
ggplotly(SLDLoessGraphic,tooltip=c("info","Date"))
DetrendedDF <- UpdatedDF%>%
mutate(DetrendedN1 = N1 - loessN1)
DetrendLoessGraphic <- DetrendedDF%>%
NoNa("DetrendedN1")%>%
ggplot(aes(x=Date))+
geom_line(aes(y=DetrendedN1,
color="DetrendedN1",
info = DetrendedN1))
## Warning: Ignoring unknown aesthetics: info
ggplotly(DetrendLoessGraphic,tooltip=c("info","Date"))
mean(DetrendedDF$DetrendedN1, na.rm=TRUE)
## [1] 12278.31
NormRestruct <- function(Norm){
return(mean(Norm,na.rm=TRUE)/Norm)
}
TestNorm <- function(DF,Base,Norm){
ResidDetrendLoessGraphic <- ResidDetrendedDF%>%
NoNa(Norm)%>%
ggplot(aes(x=Date))+
geom_line(aes(y=!!sym(Base),
color=Base,
info = !!sym(Base)))+
geom_line(aes(y=!!sym(Norm),
color=Norm,
info = !!sym(Norm)))
ggplotly(ResidDetrendLoessGraphic,tooltip=c("info","Date"))
}
ResidDetrendedDF <- DetrendedDF%>%
mutate(VarCompN1 = DetrendedN1**2,
N1NormedResid = VarCompN1*NormRestruct(loessN1),
sqrtNormedResid = VarCompN1*NormRestruct(sqrt(loessN1)),
logNormedResid = VarCompN1*NormRestruct(log(loessN1)),
xlogNormedResid = VarCompN1*NormRestruct(loessN1*log(loessN1)),
WeirdNormedResid = VarCompN1*NormRestruct(sqrt(loessN1)*log(loessN1)))
ResidDetrendLoessGraphic <- ResidDetrendedDF%>%
NoNa("DetrendedN1")%>%
ggplot(aes(x=Date, y=DetrendedN1^2,
info = DetrendedN1))+
geom_point()+
geom_smooth(span = .1)
ggplotly(ResidDetrendLoessGraphic,tooltip=c("info","Date"))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'